import os
import re
import time

import urllib3
from downloader import download

# Suppress urllib3's InsecureRequestWarning so it does not clutter the output.
# NOTE(review): presumably `download` issues HTTPS requests with certificate
# verification turned off -- confirm in the `downloader` module.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def get_site(url):
    """Return the ``scheme://host`` prefix of the first http(s) URL in *url*.

    Unlike a pattern that requires a ``/`` after the host, this also works for
    URLs that have no path component (e.g. ``https://example.com``).

    Args:
        url: Text containing an ``http://`` or ``https://`` URL.

    Returns:
        The scheme and host part, without a trailing slash.

    Raises:
        IndexError: If *url* contains no ``http://`` or ``https://`` URL.
    """
    # The host ends at the first path, query, fragment or whitespace character.
    return re.findall(r'https?://[^/?#\s]*', url)[0]


def extract_book_content(url):
    """Collect the text lines of a chapter, following its "下一页" links.

    Each page is fetched with ``download``; the text of every ``<p>`` inside
    ``<div id="Lab_Contents">`` is taken, ideographic spaces (U+3000) are
    removed, and lines containing the site's boilerplate markers are dropped.

    Pagination is best-effort: a failure on the first page propagates to the
    caller, while a failure on a later page stops pagination and the text
    gathered so far is returned.

    Args:
        url: URL of the chapter's first page.

    Returns:
        list[str]: The chapter's text lines, in page order.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    # Boilerplate phrases the site mixes into the chapter text.
    noise_markers = ('请大家收藏', '请点击下一页')

    collected = []
    visited = set()  # guards against a "next page" link that loops back
    page_url = url
    while page_url and page_url not in visited:
        is_first_page = not visited
        visited.add(page_url)
        try:
            resp = download(page_url)
            paragraphs = resp.xpath('//div[@id="Lab_Contents"]/p/text()').extract()
        except Exception as exc:
            if is_first_page:
                raise
            # Keep what we already have instead of losing the whole chapter.
            print(f'warning: stopped paginating at {page_url}: {exc!r}')
            break

        collected.extend(text.replace('\u3000', '') for text in paragraphs)

        next_href = resp.xpath('//a[contains(text(),"下一页")]/@href').extract_first()
        # urljoin handles absolute-path, relative and fully-qualified hrefs.
        page_url = urljoin(resp.url, next_href) if next_href else None

    return [
        line for line in collected
        if not any(marker in line for marker in noise_markers)
    ]


def extract_book_page(book_url):
    """Fetch a book's index page and list its chapters.

    A chapter is taken from the parent element of every link whose text
    contains "第"; the first ``<a>`` child of that parent supplies the title
    and the href. Elements without an href are skipped.

    Args:
        book_url: URL of the book's index page.

    Returns:
        tuple: ``(book_name, chapters)``. ``book_name`` is currently always
        the fixed string ``'mxd'``. ``chapters`` is a list of dicts with the
        keys ``'title'``, ``'url'`` (absolute) and ``'order'`` (the position
        of the element among the matched elements).
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    resp = download(book_url)
    chapters = []
    containers = resp.xpath('//a[contains(text(),"第")]/..')
    for i, container in enumerate(containers):
        title = container.xpath('a/text()').extract_first()
        href = container.xpath('a/@href').extract_first()
        if not href:
            # Nothing to download for a link without a target.
            continue
        chapters.append({
            'title': title,
            # urljoin handles absolute-path, relative and full hrefs alike.
            'url': urljoin(resp.url, href),
            'order': i,
        })
    # NOTE: the book name is hard-coded; the page's <h1> is not read.
    return 'mxd', chapters

def download_book(directory, book_url):
    """Download every chapter of a book into ``<directory>/<bookname>.txt``.

    The chapter list and book name come from ``extract_book_page``; each
    chapter's text comes from ``extract_book_content``. A success/failure
    line is printed per chapter, and the chapter title is written to the file
    even when no text could be fetched.

    Args:
        directory: Folder to write the book into; created if missing.
        book_url: URL of the book's index page.
    """
    bookname, chapters = extract_book_page(book_url)
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, f'{bookname}.txt')
    with open(path, 'w', encoding='utf-8') as f:
        for c in chapters:
            title = c['title']
            lines = extract_book_content(c['url'])
            status = '爬取成功' if lines else '爬取失败'
            print(f'{title} {status}')

            f.write(f'\n\n{title}\n')
            f.writelines(f'\n\t{line}\n' for line in lines)
            # Brief pause between chapters to go easy on the server.
            time.sleep(0.1)




if __name__ == '__main__':
    # Index page of the book to scrape; surrounding whitespace from the
    # triple-quoted literal is stripped.
    url = """
        https://www.mxdzw.com/Chapter/damingwanghou_67970.html
    """.strip()

    # Output folder for the downloaded book, created on first run.
    # NOTE(review): the name `dirctory` is left as-is because other code in
    # this module may read it as a global -- check before renaming.
    dirctory = './books'
    already_there = os.path.exists(dirctory)
    if not already_there:
        os.mkdir(dirctory)

    download_book(dirctory, url)